{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Adjustment for chance in clustering performance evaluation\n\n\nThe following plots demonstrate the impact of the number of clusters and\nnumber of samples on various clustering performance evaluation metrics.\n\nNon-adjusted measures such as the V-Measure show a dependency between\nthe number of clusters and the number of samples: the mean V-Measure\nof random labeling increases significantly as the number of clusters is\ncloser to the total number of samples used to compute the measure.\n\nAdjusted for chance measure such as ARI display some random variations\ncentered around a mean score of 0.0 for any number of samples and\nclusters.\n\nOnly adjusted measures can hence safely be used as a consensus index\nto evaluate the average stability of clustering algorithms for a given\nvalue of k on various overlapping sub-samples of the dataset.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n# Author: Olivier Grisel <olivier.grisel@ensta.org>\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom time import time\nfrom sklearn import metrics\n\ndef uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n                             fixed_n_classes=None, n_runs=5, seed=42):\n    \"\"\"Compute score for 2 random uniform cluster labelings.\n\n    Both random labelings have the same number of clusters for each value\n    possible value in ``n_clusters_range``.\n\n    When fixed_n_classes is not None the first labeling is considered a ground\n    truth class assignment with fixed number of classes.\n    \"\"\"\n    random_labels = np.random.RandomState(seed).randint\n    scores = np.zeros((len(n_clusters_range), n_runs))\n\n    if fixed_n_classes is not None:\n        labels_a = random_labels(low=0, high=fixed_n_classes, size=n_samples)\n\n    for i, k in enumerate(n_clusters_range):\n        for j in range(n_runs):\n            if fixed_n_classes is None:\n                labels_a = random_labels(low=0, high=k, size=n_samples)\n            labels_b = random_labels(low=0, high=k, size=n_samples)\n            scores[i, j] = score_func(labels_a, labels_b)\n    return scores\n\n\ndef ami_score(U, V):\n    return metrics.adjusted_mutual_info_score(U, V)\n\nscore_funcs = [\n    metrics.adjusted_rand_score,\n    metrics.v_measure_score,\n    ami_score,\n    metrics.mutual_info_score,\n]\n\n# 2 independent random clusterings with equal cluster number\n\nn_samples = 100\nn_clusters_range = np.linspace(2, n_samples, 10).astype(np.int)\n\nplt.figure(1)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n    print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n          % (score_func.__name__, len(n_clusters_range), n_samples))\n\n    t0 = time()\n    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)\n    print(\"done in %0.3fs\" % (time() - t0))\n    plots.append(plt.errorbar(\n        n_clusters_range, np.median(scores, axis=1), scores.std(axis=1))[0])\n    names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for 2 random uniform labelings\\n\"\n          \"with equal number of clusters\")\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.legend(plots, names)\nplt.ylim(bottom=-0.05, top=1.05)\n\n\n# Random labeling with varying n_clusters against ground class labels\n# with fixed number of clusters\n\nn_samples = 1000\nn_clusters_range = np.linspace(2, 100, 10).astype(np.int)\nn_classes = 10\n\nplt.figure(2)\n\nplots = []\nnames = []\nfor score_func in score_funcs:\n    print(\"Computing %s for %d values of n_clusters and n_samples=%d\"\n          % (score_func.__name__, len(n_clusters_range), n_samples))\n\n    t0 = time()\n    scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,\n                                      fixed_n_classes=n_classes)\n    print(\"done in %0.3fs\" % (time() - t0))\n    plots.append(plt.errorbar(\n        n_clusters_range, scores.mean(axis=1), scores.std(axis=1))[0])\n    names.append(score_func.__name__)\n\nplt.title(\"Clustering measures for random uniform labeling\\n\"\n          \"against reference assignment with %d classes\" % n_classes)\nplt.xlabel('Number of clusters (Number of samples is fixed to %d)' % n_samples)\nplt.ylabel('Score value')\nplt.ylim(bottom=-0.05, top=1.05)\nplt.legend(plots, names)\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}